1 Data Preprocessing, Import, and Cleaning

# Install required packages.
# Only install the ones that are not already present, so the script does not
# re-download everything on every run.
options(repos = c(CRAN = "https://cloud.r-project.org"))
required_pkgs <- c("randomForest", "caret", "xgboost", "Metrics")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(xgboost)
library(randomForest)
library(janitor)
library(tidyverse)
library(dplyr)
library(mice)
library(naniar)
library(Metrics)
library(rsample)
# Import "college_data.csv" as college_df
college_df <- read.csv("college_data.csv")

# Drop every unweighted column plus the redundant tier-name columns,
# all in a single select
college_df <- college_df %>%
  select(
    -matches("unwgt"),
    -tier_name,
    -test_band_tier,
    -flagship
  )

# Subset containing only public schools
public_df <- college_df %>%
  filter(public == "Public")

# Subset containing only private schools; the in-state/out-of-state columns
# do not apply to private institutions, so drop them here.
private_df <- college_df %>%
  filter(public == "Private") %>%
  select(-matches("instate|oostate"))

# Inspect the missingness pattern in each subset
vis_miss(private_df)

vis_miss(public_df)

# Keep only rows with fewer than 4 missing values, in both data sets.
# A tiny helper keeps the row-wise NA count readable.
na_per_row <- function(df) rowSums(is.na(df))

private_df <- filter(private_df, na_per_row(private_df) < 4)

public_df <- filter(public_df, na_per_row(public_df) < 4)
# Impute the remaining missing values with MICE (CART-based imputation).
# The original calls did not set a seed, so the imputed values differed from
# run to run; `seed =` makes the imputation reproducible, matching the
# set.seed(111111) discipline used elsewhere in this script.
imputed_values <- mice(data = private_df,   # Data set to impute
                       m = 1,               # Number of imputed data sets
                       maxit = 40,          # Max iterations per imputation
                       method = "cart",     # Imputation method: CART
                       seed = 111111,       # Reproducible random draws
                       print = FALSE)
private_df <- complete(imputed_values, 1)

imputed_values_public <- mice(data = public_df,  # Same settings for public schools
                              m = 1,
                              maxit = 40,
                              method = "cart",
                              seed = 111111,
                              print = FALSE)
public_df <- complete(imputed_values_public, 1)
# Re-check missingness to confirm the imputation filled everything in
vis_miss(private_df)

vis_miss(public_df)

# Each data frame now contains a single sector, so the `public` indicator
# column is constant and can be dropped.
public_df <- select(public_df, -public)

private_df <- select(private_df, -public)

# Convert the tier and name columns to factors
public_df <- public_df %>%
  mutate(across(c(tier, name), as.factor))

2 Modeling

2.1 Splitting into Testing and Training

set.seed(111111)

# 80/20 train/test split of private_df.
# (Named private_split rather than `split` to avoid shadowing base::split.)
private_split <- initial_split(private_df, prop = 0.8)

# Training set
private_train_df <- training(private_split)
# Test set
private_test_df <- testing(private_split)

# Report partition sizes for the private-school data
cat("Training data size: ", nrow(private_train_df), "\n")
## Training data size:  844
cat("Test data size: ", nrow(private_test_df), "\n")
## Test data size:  212
# Identical 80/20 split for the public-school data
set.seed(111111)

public_split <- initial_split(public_df, prop = 0.8)
public_train_df <- training(public_split)
public_test_df <- testing(public_split)

cat("Training data size: ", nrow(public_train_df), "\n")
## Training data size:  489
cat("Test data size: ", nrow(public_test_df), "\n")
## Test data size:  123

2.2 Linear Regression

# Linear regression model; exclude the name and super_opeid columns, which
# are identifiers rather than predictors.
# NOTE(review): this model is fit on the FULL public_df rather than on
# public_train_df. The tree-based models below are evaluated on a held-out
# test set; if this lm is meant to be compared against them, it should be
# fit on the training split instead — confirm intent.
public_fit <- lm(rel_apply ~ . - name - super_opeid, data = public_df)
summary(public_fit)
## 
## Call:
## lm(formula = rel_apply ~ . - name - super_opeid, data = public_df)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.138679 -0.013715 -0.000181  0.013131  0.123546 
## 
## Coefficients: (1 not defined because of singularities)
##                                                Estimate Std. Error t value
## (Intercept)                                  -3.996e-03  3.743e-02  -0.107
## par_income_bin                               -6.555e-06  1.425e-04  -0.046
## par_income_lab20-40                           3.382e-03  7.440e-03   0.455
## par_income_lab40-60                           1.367e-03  1.099e-02   0.124
## par_income_lab60-70                           6.745e-03  1.155e-02   0.584
## par_income_lab70-80                           4.947e-03  1.367e-02   0.362
## par_income_lab80-90                          -4.204e-03  1.594e-02  -0.264
## par_income_lab90-95                          -1.583e-02  1.506e-02  -1.051
## par_income_lab95-96                          -5.276e-03  7.247e-03  -0.728
## par_income_lab96-97                          -8.968e-03  7.030e-03  -1.276
## par_income_lab97-98                          -8.733e-03  6.904e-03  -1.265
## par_income_lab98-99                          -1.205e-02  6.313e-03  -1.909
## par_income_labTop 1                                  NA         NA      NA
## attend                                        5.192e-01  2.904e+00   0.179
## stderr_attend                                 4.130e+01  7.538e+01   0.548
## attend_level                                 -1.181e+00  3.027e+00  -0.390
## attend_sat                                    7.443e+00  3.148e+00   2.365
## stderr_attend_sat                            -8.530e+01  5.020e+01  -1.699
## attend_level_sat                             -3.347e+00  3.159e+00  -1.059
## stderr_rel_apply                              5.670e+00  9.934e-01   5.707
## rel_attend                                    6.424e-01  2.273e-02  28.265
## stderr_rel_attend                            -6.919e-02  3.536e-01  -0.196
## rel_att_cond_app                             -6.439e-01  3.325e-02 -19.368
## rel_apply_sat                                 4.703e-01  2.074e-02  22.678
## stderr_rel_apply_sat                         -2.801e+00  4.575e-01  -6.124
## rel_attend_sat                               -2.065e-01  1.781e-02 -11.596
## stderr_rel_attend_sat                         7.218e-01  1.172e-01   6.158
## rel_att_cond_app_sat                          1.300e-01  1.261e-02  10.305
## attend_instate                               -9.824e-02  1.300e-01  -0.756
## stderr_attend_instate                        -3.215e+00  1.029e+00  -3.125
## attend_level_instate                          2.092e-01  1.424e-01   1.469
## attend_instate_sat                           -7.028e-02  1.057e-01  -0.665
## stderr_attend_instate_sat                     1.476e+00  4.906e-01   3.009
## attend_level_instate_sat                     -3.073e-02  1.028e-01  -0.299
## attend_oostate                                3.972e+00  3.960e+00   1.003
## stderr_attend_oostate                        -9.077e+01  1.057e+02  -0.859
## attend_level_oostate                         -7.199e+00  6.925e+00  -1.039
## attend_oostate_sat                           -1.076e+01  4.259e+00  -2.527
## stderr_attend_oostate_sat                     9.686e+01  5.894e+01   1.643
## attend_level_oostate_sat                      1.641e+01  8.694e+00   1.887
## rel_apply_instate                             6.641e-01  5.429e-02  12.233
## stderr_rel_apply_instate                     -1.006e+00  5.185e-01  -1.939
## rel_attend_instate                           -4.963e-01  5.653e-02  -8.779
## stderr_rel_attend_instate                    -7.404e-02  2.606e-01  -0.284
## rel_att_cond_app_instate                      4.853e-01  5.367e-02   9.042
## rel_apply_oostate                             2.231e-01  2.058e-02  10.841
## stderr_rel_apply_oostate                     -1.122e+00  2.801e-01  -4.003
## rel_attend_oostate                           -1.012e-01  1.172e-02  -8.636
## stderr_rel_attend_oostate                     3.099e-02  3.991e-02   0.777
## rel_att_cond_app_oostate                      1.108e-01  1.604e-02   6.908
## rel_apply_instate_sat                        -2.043e-01  3.403e-02  -6.003
## stderr_rel_apply_instate_sat                  6.042e-01  2.183e-01   2.768
## rel_attend_instate_sat                        1.012e-01  3.002e-02   3.371
## stderr_rel_attend_instate_sat                -3.212e-01  1.025e-01  -3.133
## rel_att_cond_app_instate_sat                 -6.488e-02  2.592e-02  -2.503
## rel_apply_oostate_sat                        -1.217e-01  1.602e-02  -7.596
## stderr_rel_apply_oostate_sat                  4.202e-01  1.286e-01   3.269
## rel_attend_oostate_sat                        2.515e-02  6.695e-03   3.756
## stderr_rel_attend_oostate_sat                -4.261e-02  1.603e-02  -2.659
## rel_att_cond_app_oostate_sat                 -1.084e-02  8.971e-03  -1.208
## tierOther elite schools (public and private) -1.121e-02  5.084e-03  -2.205
## tierSelective public                          3.872e-04  3.648e-03   0.106
##                                              Pr(>|t|)    
## (Intercept)                                  0.915023    
## par_income_bin                               0.963327    
## par_income_lab20-40                          0.649630    
## par_income_lab40-60                          0.901013    
## par_income_lab60-70                          0.559404    
## par_income_lab70-80                          0.717517    
## par_income_lab80-90                          0.792123    
## par_income_lab90-95                          0.293564    
## par_income_lab95-96                          0.466879    
## par_income_lab96-97                          0.202645    
## par_income_lab97-98                          0.206454    
## par_income_lab98-99                          0.056768 .  
## par_income_labTop 1                                NA    
## attend                                       0.858144    
## stderr_attend                                0.584035    
## attend_level                                 0.696537    
## attend_sat                                   0.018397 *  
## stderr_attend_sat                            0.089868 .  
## attend_level_sat                             0.289849    
## stderr_rel_apply                             1.88e-08 ***
## rel_attend                                    < 2e-16 ***
## stderr_rel_attend                            0.844939    
## rel_att_cond_app                              < 2e-16 ***
## rel_apply_sat                                 < 2e-16 ***
## stderr_rel_apply_sat                         1.74e-09 ***
## rel_attend_sat                                < 2e-16 ***
## stderr_rel_attend_sat                        1.42e-09 ***
## rel_att_cond_app_sat                          < 2e-16 ***
## attend_instate                               0.450140    
## stderr_attend_instate                        0.001872 ** 
## attend_level_instate                         0.142489    
## attend_instate_sat                           0.506255    
## stderr_attend_instate_sat                    0.002738 ** 
## attend_level_instate_sat                     0.765205    
## attend_oostate                               0.316257    
## stderr_attend_oostate                        0.390661    
## attend_level_oostate                         0.299037    
## attend_oostate_sat                           0.011770 *  
## stderr_attend_oostate_sat                    0.100906    
## attend_level_oostate_sat                     0.059654 .  
## rel_apply_instate                             < 2e-16 ***
## stderr_rel_apply_instate                     0.052960 .  
## rel_attend_instate                            < 2e-16 ***
## stderr_rel_attend_instate                    0.776436    
## rel_att_cond_app_instate                      < 2e-16 ***
## rel_apply_oostate                             < 2e-16 ***
## stderr_rel_apply_oostate                     7.10e-05 ***
## rel_attend_oostate                            < 2e-16 ***
## stderr_rel_attend_oostate                    0.437758    
## rel_att_cond_app_oostate                     1.36e-11 ***
## rel_apply_instate_sat                        3.51e-09 ***
## stderr_rel_apply_instate_sat                 0.005832 ** 
## rel_attend_instate_sat                       0.000802 ***
## stderr_rel_attend_instate_sat                0.001825 ** 
## rel_att_cond_app_instate_sat                 0.012586 *  
## rel_apply_oostate_sat                        1.32e-13 ***
## stderr_rel_apply_oostate_sat                 0.001148 ** 
## rel_attend_oostate_sat                       0.000191 ***
## stderr_rel_attend_oostate_sat                0.008070 ** 
## rel_att_cond_app_oostate_sat                 0.227521    
## tierOther elite schools (public and private) 0.027844 *  
## tierSelective public                         0.915504    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02869 on 551 degrees of freedom
## Multiple R-squared:  0.9938, Adjusted R-squared:  0.9932 
## F-statistic:  1483 on 60 and 551 DF,  p-value: < 2.2e-16
# Repeat the linear regression for private schools, again excluding the
# identifier columns.
# NOTE(review): like the public-school model, this is fit on the FULL
# private_df rather than the training split; fit on private_train_df if the
# goal is a fair comparison with the tree-based models on held-out data.
private_fit <- lm(rel_apply ~ . - name - super_opeid, data = private_df)
summary(private_fit)
## 
## Call:
## lm(formula = rel_apply ~ . - name - super_opeid, data = private_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.49347 -0.02754 -0.00059  0.02724  0.26238 
## 
## Coefficients: (1 not defined because of singularities)
##                                                Estimate Std. Error t value
## (Intercept)                                   3.844e-01  2.198e-02  17.487
## par_income_bin                                5.362e-04  1.490e-04   3.599
## par_income_lab20-40                           2.345e-02  8.626e-03   2.719
## par_income_lab40-60                           3.882e-02  9.440e-03   4.113
## par_income_lab60-70                           3.266e-02  1.006e-02   3.247
## par_income_lab70-80                           3.806e-02  1.082e-02   3.519
## par_income_lab80-90                           3.999e-02  1.149e-02   3.479
## par_income_lab90-95                           3.481e-02  1.140e-02   3.053
## par_income_lab95-96                           2.475e-02  1.094e-02   2.263
## par_income_lab96-97                           4.308e-02  1.064e-02   4.050
## par_income_lab97-98                           3.408e-02  1.010e-02   3.375
## par_income_lab98-99                           3.685e-02  9.521e-03   3.871
## par_income_labTop 1                                  NA         NA      NA
## attend                                        7.277e+00  2.441e+00   2.981
## stderr_attend                                -5.889e+01  1.825e+01  -3.227
## attend_level                                 -4.902e+00  2.709e+00  -1.809
## attend_sat                                   -4.764e+00  1.760e+00  -2.707
## stderr_attend_sat                             6.850e+01  1.497e+01   4.577
## attend_level_sat                              1.561e+00  2.204e+00   0.708
## stderr_rel_apply                              3.358e+00  4.402e-01   7.629
## rel_attend                                    4.414e-01  9.998e-03  44.146
## stderr_rel_attend                            -3.286e-03  1.318e-01  -0.025
## rel_att_cond_app                             -4.522e-01  1.935e-02 -23.369
## rel_apply_sat                                 5.281e-01  1.550e-02  34.078
## stderr_rel_apply_sat                         -1.290e+00  1.663e-01  -7.757
## rel_attend_sat                               -1.336e-01  1.021e-02 -13.082
## stderr_rel_attend_sat                         3.002e-04  3.929e-02   0.008
## rel_att_cond_app_sat                          1.179e-01  1.125e-02  10.474
## tierIvy Plus                                 -1.888e-02  1.243e-02  -1.519
## tierOther elite schools (public and private) -9.060e-03  6.045e-03  -1.499
## tierSelective private                         1.507e-02  1.071e-02   1.407
##                                              Pr(>|t|)    
## (Intercept)                                   < 2e-16 ***
## par_income_bin                               0.000334 ***
## par_income_lab20-40                          0.006660 ** 
## par_income_lab40-60                          4.22e-05 ***
## par_income_lab60-70                          0.001202 ** 
## par_income_lab70-80                          0.000452 ***
## par_income_lab80-90                          0.000524 ***
## par_income_lab90-95                          0.002321 ** 
## par_income_lab95-96                          0.023851 *  
## par_income_lab96-97                          5.51e-05 ***
## par_income_lab97-98                          0.000767 ***
## par_income_lab98-99                          0.000115 ***
## par_income_labTop 1                                NA    
## attend                                       0.002944 ** 
## stderr_attend                                0.001292 ** 
## attend_level                                 0.070673 .  
## attend_sat                                   0.006906 ** 
## stderr_attend_sat                            5.29e-06 ***
## attend_level_sat                             0.478883    
## stderr_rel_apply                             5.39e-14 ***
## rel_attend                                    < 2e-16 ***
## stderr_rel_attend                            0.980113    
## rel_att_cond_app                              < 2e-16 ***
## rel_apply_sat                                 < 2e-16 ***
## stderr_rel_apply_sat                         2.10e-14 ***
## rel_attend_sat                                < 2e-16 ***
## stderr_rel_attend_sat                        0.993905    
## rel_att_cond_app_sat                          < 2e-16 ***
## tierIvy Plus                                 0.129073    
## tierOther elite schools (public and private) 0.134257    
## tierSelective private                        0.159835    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06017 on 1026 degrees of freedom
## Multiple R-squared:  0.9837, Adjusted R-squared:  0.9833 
## F-statistic:  2138 on 29 and 1026 DF,  p-value: < 2.2e-16

2.3 Random Forest

# Bagging model: a random forest where mtry equals the number of predictors,
# so every variable is considered at each split.
# The original hard-coded mtry = 23, which exceeded the predictor count and
# triggered randomForest's "invalid mtry: reset to within valid range"
# warning; computing it from the data avoids that.
set.seed(111111)
n_pred_priv <- ncol(private_train_df) - 1  # All columns except the response
bag_mod_priv <- randomForest(rel_apply ~ .,
                data = private_train_df,   # Training data
                mtry = n_pred_priv,        # Use every predictor (bagging)
                ntree = 200)               # Number of trees
bag_mod_priv
##                Type of random forest: regression
##                      Number of trees: 200
## No. of variables tried at each split: 20
## 
##           Mean of squared residuals: 0.008951499
##                     % Var explained: 95.75
bag_preds_priv <- predict(bag_mod_priv, private_test_df) # Test-set predictions

# Test-set RMSE. Named bag_rmse_priv (not `rmse`) so it does not shadow the
# Metrics::rmse function loaded at the top of the script.
bag_rmse_priv <- sqrt(mean((bag_preds_priv - private_test_df$rel_apply)^2))
print(paste("RMSE:", bag_rmse_priv))
## [1] "RMSE: 0.100328598409104"
# Repeat the bagging model for public schools.
# As above, mtry is derived from the data instead of the hard-coded 53 that
# previously triggered the "invalid mtry" warning.
set.seed(111111)
n_pred_pub <- ncol(public_train_df) - 1  # All columns except the response
bag_mod_pub <- randomForest(rel_apply ~ .,
                data = public_train_df,  # Training data
                mtry = n_pred_pub,       # Use every predictor (bagging)
                ntree = 200)             # Number of trees
bag_mod_pub
##                Type of random forest: regression
##                      Number of trees: 200
## No. of variables tried at each split: 52
## 
##           Mean of squared residuals: 0.008252995
##                     % Var explained: 92.92
bag_preds_pub <- predict(bag_mod_pub, public_test_df) # Test-set predictions

# Test-set RMSE (named to avoid shadowing Metrics::rmse)
bag_rmse_pub <- sqrt(mean((bag_preds_pub - public_test_df$rel_apply)^2))
print(paste("RMSE:", bag_rmse_pub))
## [1] "RMSE: 0.113082129428566"

2.4 XGBoost

2.4.1 Cleaning for XGBoost

# Preparation for XGBoost

# Drop the name column and par_income_lab (redundant with par_income_bin)
# from all four frames, using one shared vector of column names.
xg_drop_cols <- c("name", "par_income_lab")
xg_private_train_df <- select(private_train_df, -any_of(xg_drop_cols))
xg_private_test_df  <- select(private_test_df,  -any_of(xg_drop_cols))
xg_public_train_df  <- select(public_train_df,  -any_of(xg_drop_cols))
xg_public_test_df   <- select(public_test_df,   -any_of(xg_drop_cols))

# Ensure 'par_income_bin' is numeric in every frame
xg_private_train_df$par_income_bin <- as.numeric(xg_private_train_df$par_income_bin)
xg_private_test_df$par_income_bin  <- as.numeric(xg_private_test_df$par_income_bin)
xg_public_train_df$par_income_bin  <- as.numeric(xg_public_train_df$par_income_bin)
xg_public_test_df$par_income_bin   <- as.numeric(xg_public_test_df$par_income_bin)

# Sanity check: confirm the column is numeric everywhere
str(xg_private_train_df$par_income_bin)
##  num [1:844] 10 98.5 30 85 99.5 92.5 92.5 75 50 10 ...
str(xg_private_test_df$par_income_bin)
##  num [1:212] 30 96.5 97.5 50 75 50 75 97.5 75 92.5 ...
str(xg_public_train_df$par_income_bin)
##  num [1:489] 98.5 85 10 30 85 98.5 92.5 99.5 75 92.5 ...
str(xg_public_test_df$par_income_bin)
##  num [1:123] 95.5 96.5 97.5 50 30 50 95.5 75 65 95.5 ...
# Inspect the tier values present in each training set
print(unique(xg_private_train_df$tier))
## [1] "Highly selective private"                
## [2] "Other elite schools (public and private)"
## [3] "Ivy Plus"                                
## [4] "Selective private"
print(unique(xg_public_train_df$tier))
## [1] Selective public                        
## [2] Highly selective public                 
## [3] Other elite schools (public and private)
## 3 Levels: Highly selective public ... Selective public
# Encode tier as a numeric code ordered from least to most selective.
# The level vectors are defined ONCE and reused, so the train and test
# encodings cannot silently drift apart (the original repeated each vector
# verbatim in four places).
private_tier_levels <- c("Selective private",
                         "Highly selective private",
                         "Other elite schools (public and private)",
                         "Ivy Plus")
public_tier_levels <- c("Selective public",
                        "Highly selective public",
                        "Other elite schools (public and private)")

xg_private_train_df$tier <- as.numeric(factor(xg_private_train_df$tier,
                                              levels = private_tier_levels))
xg_private_test_df$tier <- as.numeric(factor(xg_private_test_df$tier,
                                             levels = private_tier_levels))

# Same encoding for public schools
xg_public_train_df$tier <- as.numeric(factor(xg_public_train_df$tier,
                                             levels = public_tier_levels))
xg_public_test_df$tier <- as.numeric(factor(xg_public_test_df$tier,
                                            levels = public_tier_levels))
# Build xgb.DMatrix objects for private and public schools.
# The column ranges select the predictors; the skipped column is presumably
# rel_apply, the regression target — TODO confirm against the data layout.
# The original code used `label = as.numeric(rel_apply) - 1`, which is the
# idiom for converting 1-based factor codes to 0-based CLASS labels; for a
# continuous regression target (objective "reg:squarederror" below) the raw
# values must be used, so the `- 1` has been removed.
dtrain_private <- xgb.DMatrix(data = as.matrix(xg_private_train_df[, c(2:8, 10:19)]),
                              label = as.numeric(xg_private_train_df$rel_apply))
dtest_private <- xgb.DMatrix(data = as.matrix(xg_private_test_df[, c(2:8, 10:19)]),
                             label = as.numeric(xg_private_test_df$rel_apply))
# Same construction for public schools (wider predictor set)
dtrain_public <- xgb.DMatrix(data = as.matrix(xg_public_train_df[, c(2:8, 10:51)]),
                             label = as.numeric(xg_public_train_df$rel_apply))
dtest_public <- xgb.DMatrix(data = as.matrix(xg_public_test_df[, c(2:8, 10:51)]),
                            label = as.numeric(xg_public_test_df$rel_apply))

2.4.2 Training ETA for Private Schools

# 5-fold cross-validated XGBoost fit, learning rate eta = 0.1.
# NOTE(review): these settings are identical to private_bst_mod_2 below;
# one of the two runs is redundant.
set.seed(111111)
private_bst <- xgb.cv(
  data = dtrain_private,           # Training data
  nfold = 5,                       # 5-fold cross validation
  eta = 0.1,                       # Learning rate
  nrounds = 100,                   # Boosting rounds
  early_stopping_rounds = 100,     # Rounds without improvement before stopping
  verbose = 1,                     # Print fit progress
  nthread = 1,                     # Single thread for reproducibility
  print_every_n = 20,              # Report every 20th iteration
  objective = "reg:squarederror",  # Squared-error regression
  eval_metric = "rmse"             # Track RMSE
)
## [1]  train-rmse:0.551040+0.005750    test-rmse:0.552299+0.024995 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.087554+0.001209    test-rmse:0.123863+0.025436 
## [41] train-rmse:0.025785+0.000375    test-rmse:0.087308+0.020835 
## [61] train-rmse:0.014847+0.000366    test-rmse:0.083222+0.018624 
## [81] train-rmse:0.010022+0.000500    test-rmse:0.081936+0.018388 
## [100]    train-rmse:0.007236+0.000482    test-rmse:0.081302+0.018247
# 5-fold cross-validated XGBoost fit, learning rate eta = 0.3
set.seed(111111)
private_bst_mod_1 <- xgb.cv(
  data = dtrain_private,           # Training data
  nfold = 5,                       # 5-fold cross validation
  eta = 0.3,                       # Learning rate
  nrounds = 100,                   # Boosting rounds
  early_stopping_rounds = 100,     # Rounds without improvement before stopping
  verbose = 1,                     # Print fit progress
  nthread = 1,                     # Single thread for reproducibility
  print_every_n = 20,              # Report every 20th iteration
  objective = "reg:squarederror",  # Squared-error regression
  eval_metric = "rmse"             # Track RMSE
)
## [1]  train-rmse:0.436080+0.004902    test-rmse:0.441538+0.026091 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.015118+0.000486    test-rmse:0.084970+0.019505 
## [41] train-rmse:0.005549+0.000403    test-rmse:0.082750+0.019216 
## [61] train-rmse:0.002339+0.000177    test-rmse:0.082478+0.019133 
## [81] train-rmse:0.001116+0.000133    test-rmse:0.082346+0.019166 
## [100]    train-rmse:0.000871+0.000036    test-rmse:0.082332+0.019176
# 5-fold cross-validated XGBoost fit, learning rate eta = 0.1
set.seed(111111)
private_bst_mod_2 <- xgb.cv(
  data = dtrain_private,           # Training data
  nfold = 5,                       # 5-fold cross validation
  eta = 0.1,                       # Learning rate
  nrounds = 100,                   # Boosting rounds
  early_stopping_rounds = 100,     # Rounds without improvement before stopping
  verbose = 1,                     # Print fit progress
  nthread = 1,                     # Single thread for reproducibility
  print_every_n = 20,              # Report every 20th iteration
  objective = "reg:squarederror",  # Squared-error regression
  eval_metric = "rmse"             # Track RMSE
)
## [1]  train-rmse:0.551040+0.005750    test-rmse:0.552299+0.024995 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.087554+0.001209    test-rmse:0.123863+0.025436 
## [41] train-rmse:0.025785+0.000375    test-rmse:0.087308+0.020835 
## [61] train-rmse:0.014847+0.000366    test-rmse:0.083222+0.018624 
## [81] train-rmse:0.010022+0.000500    test-rmse:0.081936+0.018388 
## [100]    train-rmse:0.007236+0.000482    test-rmse:0.081302+0.018247
# 5-fold cross-validated XGBoost fit, learning rate eta = 0.05
set.seed(111111)
private_bst_mod_3 <- xgb.cv(
  data = dtrain_private,           # Training data
  nfold = 5,                       # 5-fold cross validation
  eta = 0.05,                      # Learning rate
  nrounds = 100,                   # Boosting rounds
  early_stopping_rounds = 100,     # Rounds without improvement before stopping
  verbose = 1,                     # Print fit progress
  nthread = 1,                     # Single thread for reproducibility
  print_every_n = 20,              # Report every 20th iteration
  objective = "reg:squarederror",  # Squared-error regression
  eval_metric = "rmse"             # Track RMSE
)
## [1]  train-rmse:0.579931+0.005976    test-rmse:0.580308+0.024906 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.225495+0.002840    test-rmse:0.241696+0.025724 
## [41] train-rmse:0.094727+0.001395    test-rmse:0.129109+0.025518 
## [61] train-rmse:0.045555+0.000584    test-rmse:0.097395+0.024107 
## [81] train-rmse:0.026917+0.000420    test-rmse:0.087689+0.021002 
## [100]    train-rmse:0.019629+0.000359    test-rmse:0.084678+0.019389
# 5-fold cross-validated XGBoost fit, learning rate eta = 0.01
set.seed(111111)
private_bst_mod_4 <- xgb.cv(
  data = dtrain_private,           # Training data
  nfold = 5,                       # 5-fold cross validation
  eta = 0.01,                      # Learning rate
  nrounds = 100,                   # Boosting rounds
  early_stopping_rounds = 100,     # Rounds without improvement before stopping
  verbose = 1,                     # Print fit progress
  nthread = 1,                     # Single thread for reproducibility
  print_every_n = 20,              # Report every 20th iteration
  objective = "reg:squarederror",  # Squared-error regression
  eval_metric = "rmse"             # Track RMSE
)
## [1]  train-rmse:0.603075+0.006160    test-rmse:0.602782+0.024885 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.498657+0.005325    test-rmse:0.501702+0.025309 
## [41] train-rmse:0.413151+0.004626    test-rmse:0.419514+0.026152 
## [61] train-rmse:0.343005+0.003978    test-rmse:0.352274+0.026476 
## [81] train-rmse:0.285430+0.003433    test-rmse:0.298185+0.026535 
## [100]    train-rmse:0.240257+0.002995    test-rmse:0.256018+0.026229
# 5-fold cross-validated XGBoost fit, learning rate eta = 0.005
set.seed(111111)
private_bst_mod_5 <- xgb.cv(
  data = dtrain_private,           # Training data
  nfold = 5,                       # 5-fold cross validation
  eta = 0.005,                     # Learning rate
  nrounds = 100,                   # Boosting rounds
  early_stopping_rounds = 100,     # Rounds without improvement before stopping
  verbose = 1,                     # Print fit progress
  nthread = 1,                     # Single thread for reproducibility
  print_every_n = 20,              # Report every 20th iteration
  objective = "reg:squarederror",  # Squared-error regression
  eval_metric = "rmse"             # Track RMSE
)
## [1]  train-rmse:0.605970+0.006183    test-rmse:0.605595+0.024885 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.551000+0.005734    test-rmse:0.552274+0.025044 
## [41] train-rmse:0.501256+0.005347    test-rmse:0.504200+0.025266 
## [61] train-rmse:0.456236+0.004991    test-rmse:0.460913+0.025728 
## [81] train-rmse:0.415473+0.004649    test-rmse:0.421781+0.026121 
## [100]    train-rmse:0.380292+0.004331    test-rmse:0.387905+0.026234
# Helper: take one CV run's evaluation log and tag it with its learning
# rate, yielding columns iter / test_rmse_mean / eta
extract_eta_log <- function(cv_mod, eta_val) {
  log_df <- as.data.frame(cv_mod$evaluation_log[, c("iter", "test_rmse_mean")])
  log_df$eta <- rep(eta_val, nrow(log_df))
  log_df
}
# One frame per learning-rate model
pd1 <- extract_eta_log(private_bst_mod_1, 0.3)
pd2 <- extract_eta_log(private_bst_mod_2, 0.1)
pd3 <- extract_eta_log(private_bst_mod_3, 0.05)
pd4 <- extract_eta_log(private_bst_mod_4, 0.01)
pd5 <- extract_eta_log(private_bst_mod_5, 0.005)
# Stack the five runs into one plotting frame
plot_data <- rbind.data.frame(pd1, pd2, pd3, pd4, pd5)
# Convert eta to a factor so ggplot colours it discretely
plot_data$eta <- as.factor(plot_data$eta)
# Scatter: test RMSE by boosting iteration, coloured by learning rate
g_1 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_point(alpha = 0.5) +
  theme_bw() +
  # Strip grid lines, border, and background for a minimal look
  theme(
    panel.background = element_blank(),
    panel.border     = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Error Rate v Number of Trees",
    x = "Number of Trees",
    y = "Error Rate",
    color = "Learning \n Rate"
  )
g_1

# Smoothed (loess) version of the RMSE-by-iteration plot
g_2 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_smooth(alpha = 0.5) +
  theme_bw() +
  # Strip grid lines, border, and background for a minimal look
  theme(
    panel.background = element_blank(),
    panel.border     = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Error Rate v Number of Trees",
    x = "Number of Trees",
    y = "Error Rate",
    color = "Learning \n Rate"
  )
g_2
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Zoomed version of the smoothed error-rate plot.
# Fix: the original zoomed with xlim()/ylim(), which drop out-of-range rows
# BEFORE the loess smooth is fitted (ggplot warned "Removed 293 rows
# containing non-finite values"), biasing the fitted curves.
# coord_cartesian() zooms the viewport while keeping every observation in
# the fit.
g_3 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_smooth(alpha = 0.5) +
  theme_bw() + # Set theme
  theme(panel.grid.major = element_blank(), # Remove grid
        panel.grid.minor = element_blank(), # Remove grid
        panel.border = element_blank(), # Remove grid
        panel.background = element_blank()) + # Remove grid
  labs(x = "Number of Trees", title = "Error Rate v Number of Trees",
       y = "Error Rate", color = "Learning \n Rate") + # Set labels
  coord_cartesian(xlim = c(0, 100), ylim = c(0, 0.1)) # Zoom without dropping data
g_3
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Train the final private-school model at the learning rate chosen from the
# CV sweep (eta = 0.1).
# Fix: xgboost() is a plain training call, not cross-validation; the
# original passed nfold = 5 and xgboost warned
# 'Parameters: { "nfold" } are not used', so that argument is removed.
# With no separate watchlist, early_stopping_rounds monitors training RMSE.
private_bst_mod_final <- xgboost(data = dtrain_private, # Set training data
              eta = 0.1, # Set learning rate
              nrounds = 100, # Set number of rounds
              early_stopping_rounds = 100, # Stop after 100 rounds with no improvement
              verbose = 1, # 1 - Prints out fit
              nthread = 1, # Set number of parallel threads
              print_every_n = 20, # Prints out result every 20th iteration
              objective = "reg:squarederror",  # Regression objective
              eval_metric = "rmse") # Track RMSE
## [1]  train-rmse:0.550939 
## Will train until train_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.086868 
## [41] train-rmse:0.025991 
## [61] train-rmse:0.015625 
## [81] train-rmse:0.011242 
## [100]    train-rmse:0.008119

2.4.3 Private School Feature Importance

2.4.3.1 All Features

# Design matrices for the private-school xgboost models: predictors are all
# columns except 1 and 9; the label is rel_apply shifted to a zero base.
private_feature_cols <- c(2:8, 10:19)
dtrain_private <- xgb.DMatrix(
  data = as.matrix(xg_private_train_df[, private_feature_cols]),
  label = as.numeric(xg_private_train_df$rel_apply) - 1
)
dtest_private <- xgb.DMatrix(
  data = as.matrix(xg_private_test_df[, private_feature_cols]),
  label = as.numeric(xg_private_test_df$rel_apply) - 1
)
# Re-fit the final private-school model (eta = 0.1) so this section is
# self-contained; identical settings (and, per the log, identical results)
# to the fit in the previous section.
# Fix: dropped nfold = 5 — xgboost() performs plain training and warned
# 'Parameters: { "nfold" } are not used'.
private_bst_mod_final <- xgboost(data = dtrain_private, # Set training data
              eta = 0.1, # Set learning rate
              nrounds = 100, # Set number of rounds
              early_stopping_rounds = 100, # Stop after 100 rounds with no improvement
              verbose = 1, # 1 - Prints out fit
              nthread = 1, # Set number of parallel threads
              print_every_n = 20, # Prints out result every 20th iteration
              objective = "reg:squarederror",  # Regression objective
              eval_metric = "rmse") # Track RMSE
## [1]  train-rmse:0.550939 
## Will train until train_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.086868 
## [41] train-rmse:0.025991 
## [61] train-rmse:0.015625 
## [81] train-rmse:0.011242 
## [100]    train-rmse:0.008119
# Baseline xgboost fit on the private-school data using package defaults
# for eta and objective; only the round count and logging are specified.
set.seed(111111)
private_bst_1 <- xgboost(
  data = dtrain_private,  # Training data
  nrounds = 100,          # Number of boosting rounds
  print_every_n = 20,     # Report every 20th iteration
  verbose = 1             # Print fit progress
)
## [1]  train-rmse:0.435761 
## [21] train-rmse:0.015697 
## [41] train-rmse:0.006346 
## [61] train-rmse:0.003051 
## [81] train-rmse:0.001486 
## [100]    train-rmse:0.000905
# Gain / cover / frequency importance table for the final private model
importance_matrix_private <- xgb.importance(model = private_bst_mod_final)

# Show the ranked importances
print(importance_matrix_private)
##                   Feature         Gain       Cover   Frequency
##                    <char>        <num>       <num>       <num>
##  1:         rel_apply_sat 7.397168e-01 0.169789975 0.125720272
##  2:            rel_attend 2.418911e-01 0.366571579 0.190413829
##  3:      rel_att_cond_app 9.441161e-03 0.182131149 0.166579361
##  4:                attend 1.203453e-03 0.026034666 0.071241488
##  5:      stderr_rel_apply 9.455367e-04 0.021140367 0.044525930
##  6:  rel_att_cond_app_sat 8.453134e-04 0.026792937 0.043740178
##  7:        rel_attend_sat 8.332962e-04 0.019323760 0.041644840
##  8:  stderr_rel_apply_sat 8.140242e-04 0.028844731 0.034311158
##  9: stderr_rel_attend_sat 7.718536e-04 0.017949139 0.029858565
## 10:         stderr_attend 6.782930e-04 0.029485409 0.048192771
## 11:        par_income_bin 6.489632e-04 0.040518869 0.058407543
## 12:            attend_sat 5.875982e-04 0.018322192 0.038501833
## 13:     stderr_rel_attend 5.327781e-04 0.019033832 0.033263489
## 14:          attend_level 4.292144e-04 0.008855963 0.020167627
## 15:     stderr_attend_sat 3.312067e-04 0.017097604 0.033263489
## 16:      attend_level_sat 2.667013e-04 0.004667222 0.015191200
## 17:                  tier 6.269296e-05 0.003440606 0.004976427
# Bar chart of feature importance for the final PRIVATE (all-features) model
xgb.plot.importance(importance_matrix_private)

# Load local helper functions for SHAP scoring and plotting
# (user-specific path; adjust if the script lives elsewhere)
source("~/Downloads/a_insights_shap_functions.r")

# Rank features of the final private model by SHAP score
shap_result <- shap.score.rank(
  xgb_model = private_bst_mod_final,
  X_train = as.matrix(xg_private_train_df[, c(2:8, 10:19)]),
  shap_approx = FALSE  # exact SHAP values rather than the approximation
)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## make SHAP score by decreasing order
# Top-10 SHAP feature importances
var_importance(shap_result, top_n = 10)

# Reshape SHAP values to long format for the summary (beeswarm) plot
shap_long <- shap.prep(
  shap = shap_result,
  X_train = as.matrix(xg_private_train_df[, c(2:8, 10:19)]),
  top_n = 10
)
## Loading required package: ggforce
plot.shap.summary(data_long = shap_long)

2.4.3.2 Detailed Features

# Design matrices for the reduced ("detailed") private feature set:
# columns 2:8 plus 17:19 only; label is rel_apply shifted to a zero base.
dtrain_private2 <- xgb.DMatrix(data = as.matrix(xg_private_train_df[,c(2:8, 17:19)]), label = as.numeric(xg_private_train_df$rel_apply) -1)
dtest_private2 <- xgb.DMatrix(data = as.matrix(xg_private_test_df[,c(2:8, 17:19)]), label = as.numeric(xg_private_test_df$rel_apply) - 1)
# Train the detailed-feature private model (eta = 0.1).
# Fix: dropped nfold = 5 — xgboost() performs plain training and warned
# 'Parameters: { "nfold" } are not used'.
private_bst_mod_final2 <- xgboost(data = dtrain_private2, # Set training data
              eta = 0.1, # Set learning rate
              nrounds = 100, # Set number of rounds
              early_stopping_rounds = 100, # Stop after 100 rounds with no improvement
              verbose = 1, # 1 - Prints out fit
              nthread = 1, # Set number of parallel threads
              print_every_n = 20, # Prints out result every 20th iteration
              objective = "reg:squarederror",  # Regression objective
              eval_metric = "rmse") # Track RMSE
## [1]  train-rmse:0.554705 
## Will train until train_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.118128 
## [41] train-rmse:0.054982 
## [61] train-rmse:0.038942 
## [81] train-rmse:0.029498 
## [100]    train-rmse:0.023104
# Importance table for the detailed-feature private model
# (note: this overwrites importance_matrix_private from the full model)
importance_matrix_private <- xgb.importance(model = private_bst_mod_final2)

# Show the ranked importances
print(importance_matrix_private)
##                   Feature        Gain      Cover  Frequency
##                    <char>       <num>      <num>      <num>
##  1:        par_income_bin 0.456318412 0.14352442 0.11560430
##  2:            attend_sat 0.135190376 0.13729117 0.11286269
##  3:                attend 0.122619651 0.14487323 0.16952250
##  4: stderr_rel_attend_sat 0.104271406 0.13397901 0.11286269
##  5:          attend_level 0.100552104 0.09488949 0.11034955
##  6:  rel_att_cond_app_sat 0.021067137 0.16861107 0.13068312
##  7:      attend_level_sat 0.019964851 0.05287290 0.06945396
##  8:         stderr_attend 0.019887394 0.05889865 0.08407585
##  9:     stderr_attend_sat 0.011067454 0.04446879 0.06808316
## 10:                  tier 0.009061217 0.02059128 0.02650217
# Bar chart of feature importance for the detailed PRIVATE model
xgb.plot.importance(importance_matrix_private)

# Re-source the SHAP helper functions (already loaded above; harmless)
source("~/Downloads/a_insights_shap_functions.r")

# SHAP importance for the detailed-feature private model
shap_result <- shap.score.rank(
  xgb_model = private_bst_mod_final2,
  X_train = as.matrix(xg_private_train_df[, c(2:8, 17:19)]),
  shap_approx = FALSE  # exact SHAP values
)
## make SHAP score by decreasing order
# Top-10 SHAP feature importances
var_importance(shap_result, top_n = 10)

# Long-format SHAP values for the summary plot
shap_long <- shap.prep(
  shap = shap_result,
  X_train = as.matrix(xg_private_train_df[, c(2:8, 17:19)]),
  top_n = 10
)

plot.shap.summary(data_long = shap_long)

2.4.4 Training ETA for Public Schools

# Design matrices for the public-school models: predictors are all columns
# except 1 and 9; label is rel_apply shifted to a zero base.
public_feature_cols <- c(2:8, 10:51)
dtrain_public <- xgb.DMatrix(
  data = as.matrix(xg_public_train_df[, public_feature_cols]),
  label = as.numeric(xg_public_train_df$rel_apply) - 1
)
dtest_public <- xgb.DMatrix(
  data = as.matrix(xg_public_test_df[, public_feature_cols]),
  label = as.numeric(xg_public_test_df$rel_apply) - 1
)
# Long cross-validation run (400 rounds) at eta = 0.1 for the public data
set.seed(111111)
bst <- xgb.cv(
  data = dtrain_public,               # Training data
  params = list(
    eta = 0.1,                        # Learning rate
    objective = "reg:squarederror",   # Regression objective
    eval_metric = "rmse",             # Track RMSE
    nthread = 1                       # Single thread for reproducibility
  ),
  nfold = 5,                          # 5-fold cross-validation
  nrounds = 400,                      # Boosting rounds
  early_stopping_rounds = 400,        # Stop after 400 rounds without improvement
  verbose = 1,                        # Print fit progress
  print_every_n = 20                  # Report every 20th iteration
)
## [1]  train-rmse:0.498638+0.008287    test-rmse:0.498787+0.034812 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 400 rounds.
## 
## [21] train-rmse:0.074538+0.000989    test-rmse:0.106366+0.008186 
## [41] train-rmse:0.016654+0.000267    test-rmse:0.074568+0.007089 
## [61] train-rmse:0.006790+0.000123    test-rmse:0.071566+0.007040 
## [81] train-rmse:0.003968+0.000058    test-rmse:0.070815+0.006861 
## [101]    train-rmse:0.002410+0.000070    test-rmse:0.070635+0.006827 
## [121]    train-rmse:0.001487+0.000104    test-rmse:0.070569+0.006789 
## [141]    train-rmse:0.000967+0.000096    test-rmse:0.070539+0.006788 
## [161]    train-rmse:0.000720+0.000024    test-rmse:0.070526+0.006779 
## [181]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [201]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [221]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [241]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [261]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [281]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [301]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [321]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [341]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [361]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [381]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780 
## [400]    train-rmse:0.000714+0.000026    test-rmse:0.070527+0.006780
# Baseline public-school xgboost fit with package defaults for eta/objective
set.seed(111111)
public_bst_1 <- xgboost(
  data = dtrain_public,   # Training data
  nrounds = 100,          # Number of boosting rounds
  print_every_n = 20,     # Report every 20th iteration
  verbose = 1             # Print fit progress
)
## [1]  train-rmse:0.392911 
## [21] train-rmse:0.007654 
## [41] train-rmse:0.001731 
## [61] train-rmse:0.000629 
## [81] train-rmse:0.000629 
## [100]    train-rmse:0.000629
# 5-fold CV on the public data at eta = 0.1
set.seed(111111)
public_bst <- xgb.cv(
  data = dtrain_public,               # Training data
  params = list(
    eta = 0.1,                        # Learning rate
    objective = "reg:squarederror",   # Regression objective
    eval_metric = "rmse",             # Track RMSE
    nthread = 1                       # Single thread
  ),
  nfold = 5,                          # 5-fold cross-validation
  nrounds = 100,                      # Boosting rounds
  early_stopping_rounds = 100,        # Stop after 100 rounds without improvement
  verbose = 1,                        # Print fit progress
  print_every_n = 20                  # Report every 20th iteration
)
## [1]  train-rmse:0.498638+0.008287    test-rmse:0.498787+0.034812 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.074538+0.000989    test-rmse:0.106366+0.008186 
## [41] train-rmse:0.016654+0.000267    test-rmse:0.074568+0.007089 
## [61] train-rmse:0.006790+0.000123    test-rmse:0.071566+0.007040 
## [81] train-rmse:0.003968+0.000058    test-rmse:0.070815+0.006861 
## [100]    train-rmse:0.002485+0.000082    test-rmse:0.070653+0.006824
# Public learning-rate sweep, eta = 0.3
set.seed(111111)
public_bst_mod_1 <- xgb.cv(
  data = dtrain_public,               # Training data
  params = list(
    eta = 0.3,                        # Learning rate under test
    objective = "reg:squarederror",   # Regression objective
    eval_metric = "rmse",             # Track RMSE
    nthread = 1                       # Single thread
  ),
  nfold = 5,                          # 5-fold cross-validation
  nrounds = 100,                      # Boosting rounds
  early_stopping_rounds = 100,        # Stop after 100 rounds without improvement
  verbose = 1,                        # Print fit progress
  print_every_n = 20                  # Report every 20th iteration
)
## [1]  train-rmse:0.393587+0.006288    test-rmse:0.397344+0.028556 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.006762+0.000454    test-rmse:0.074262+0.005257 
## [41] train-rmse:0.001389+0.000197    test-rmse:0.073524+0.005366 
## [61] train-rmse:0.000625+0.000038    test-rmse:0.073471+0.005347 
## [81] train-rmse:0.000625+0.000038    test-rmse:0.073471+0.005347 
## [100]    train-rmse:0.000625+0.000038    test-rmse:0.073471+0.005347
# Public learning-rate sweep, eta = 0.1
set.seed(111111)
public_bst_mod_2 <- xgb.cv(
  data = dtrain_public,               # Training data
  params = list(
    eta = 0.1,                        # Learning rate under test
    objective = "reg:squarederror",   # Regression objective
    eval_metric = "rmse",             # Track RMSE
    nthread = 1                       # Single thread
  ),
  nfold = 5,                          # 5-fold cross-validation
  nrounds = 100,                      # Boosting rounds
  early_stopping_rounds = 100,        # Stop after 100 rounds without improvement
  verbose = 1,                        # Print fit progress
  print_every_n = 20                  # Report every 20th iteration
)
## [1]  train-rmse:0.498638+0.008287    test-rmse:0.498787+0.034812 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.074538+0.000989    test-rmse:0.106366+0.008186 
## [41] train-rmse:0.016654+0.000267    test-rmse:0.074568+0.007089 
## [61] train-rmse:0.006790+0.000123    test-rmse:0.071566+0.007040 
## [81] train-rmse:0.003968+0.000058    test-rmse:0.070815+0.006861 
## [100]    train-rmse:0.002485+0.000082    test-rmse:0.070653+0.006824
# Public learning-rate sweep, eta = 0.05
set.seed(111111)
public_bst_mod_3 <- xgb.cv(
  data = dtrain_public,               # Training data
  params = list(
    eta = 0.05,                       # Learning rate under test
    objective = "reg:squarederror",   # Regression objective
    eval_metric = "rmse",             # Track RMSE
    nthread = 1                       # Single thread
  ),
  nfold = 5,                          # 5-fold cross-validation
  nrounds = 100,                      # Boosting rounds
  early_stopping_rounds = 100,        # Stop after 100 rounds without improvement
  verbose = 1,                        # Print fit progress
  print_every_n = 20                  # Report every 20th iteration
)
## [1]  train-rmse:0.525009+0.008782    test-rmse:0.524401+0.036329 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.201132+0.002951    test-rmse:0.215460+0.016975 
## [41] train-rmse:0.081311+0.001072    test-rmse:0.112250+0.007407 
## [61] train-rmse:0.035773+0.000357    test-rmse:0.082799+0.006179 
## [81] train-rmse:0.017891+0.000265    test-rmse:0.075423+0.007179 
## [100]    train-rmse:0.010705+0.000262    test-rmse:0.072971+0.007478
# Public learning-rate sweep, eta = 0.01
set.seed(111111)
public_bst_mod_4 <- xgb.cv(
  data = dtrain_public,               # Training data
  params = list(
    eta = 0.01,                       # Learning rate under test
    objective = "reg:squarederror",   # Regression objective
    eval_metric = "rmse",             # Track RMSE
    nthread = 1                       # Single thread
  ),
  nfold = 5,                          # 5-fold cross-validation
  nrounds = 100,                      # Boosting rounds
  early_stopping_rounds = 100,        # Stop after 100 rounds without improvement
  verbose = 1,                        # Print fit progress
  print_every_n = 20                  # Report every 20th iteration
)
## [1]  train-rmse:0.546130+0.009176    test-rmse:0.544945+0.037534 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.450830+0.007398    test-rmse:0.452225+0.032298 
## [41] train-rmse:0.372723+0.005968    test-rmse:0.376796+0.027986 
## [61] train-rmse:0.308653+0.004829    test-rmse:0.315404+0.024344 
## [81] train-rmse:0.255994+0.003891    test-rmse:0.265864+0.020819 
## [100]    train-rmse:0.214652+0.003176    test-rmse:0.227938+0.017899
# Public learning-rate sweep, eta = 0.005
set.seed(111111)
public_bst_mod_5 <- xgb.cv(
  data = dtrain_public,               # Training data
  params = list(
    eta = 0.005,                      # Learning rate under test
    objective = "reg:squarederror",   # Regression objective
    eval_metric = "rmse",             # Track RMSE
    nthread = 1                       # Single thread
  ),
  nfold = 5,                          # 5-fold cross-validation
  nrounds = 100,                      # Boosting rounds
  early_stopping_rounds = 100,        # Stop after 100 rounds without improvement
  verbose = 1,                        # Print fit progress
  print_every_n = 20                  # Report every 20th iteration
)
## [1]  train-rmse:0.548771+0.009226    test-rmse:0.547516+0.037684 
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.498612+0.008292    test-rmse:0.498663+0.034964 
## [41] train-rmse:0.453205+0.007442    test-rmse:0.454545+0.032409 
## [61] train-rmse:0.412086+0.006679    test-rmse:0.414768+0.030110 
## [81] train-rmse:0.374844+0.006005    test-rmse:0.378823+0.028105 
## [100]    train-rmse:0.342718+0.005436    test-rmse:0.347876+0.026375
# Helper: tag one CV run's evaluation log with its learning rate,
# producing columns iter / test_rmse_mean / eta
cv_log_with_eta <- function(cv_mod, eta_val) {
  log_df <- as.data.frame(cv_mod$evaluation_log[, c("iter", "test_rmse_mean")])
  log_df$eta <- rep(eta_val, nrow(log_df))
  log_df
}
# One frame per learning-rate model
pd1 <- cv_log_with_eta(public_bst_mod_1, 0.3)
pd2 <- cv_log_with_eta(public_bst_mod_2, 0.1)
pd3 <- cv_log_with_eta(public_bst_mod_3, 0.05)
pd4 <- cv_log_with_eta(public_bst_mod_4, 0.01)
pd5 <- cv_log_with_eta(public_bst_mod_5, 0.005)
# Stack the five runs into one plotting frame
plot_data <- rbind.data.frame(pd1, pd2, pd3, pd4, pd5)
# Convert eta to a factor for discrete colouring
plot_data$eta <- as.factor(plot_data$eta)
# Plot points
# Scatter: public-model test RMSE by iteration, coloured by learning rate
g_4 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_point(alpha = 0.5) +
  theme_bw() +
  # Strip grid lines, border, and background for a minimal look
  theme(
    panel.background = element_blank(),
    panel.border     = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Error Rate v Number of Trees",
    x = "Number of Trees",
    y = "Error Rate",
    color = "Learning \n Rate"
  )
g_4

# Smoothed (loess) version of the public RMSE-by-iteration plot
g_5 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_smooth(alpha = 0.5) +
  theme_bw() +
  # Strip grid lines, border, and background for a minimal look
  theme(
    panel.background = element_blank(),
    panel.border     = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  labs(
    title = "Error Rate v Number of Trees",
    x = "Number of Trees",
    y = "Error Rate",
    color = "Learning \n Rate"
  )
g_5
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Zoomed smoothed plot for the public models.
# Fix: xlim()/ylim() drop out-of-range rows BEFORE the loess fit (ggplot
# warned "Removed 275 rows containing non-finite values"), biasing the
# curves; coord_cartesian() zooms the viewport without discarding any data.
g_6 <- ggplot(plot_data, aes(x = iter, y = test_rmse_mean, color = eta)) +
  geom_smooth(alpha = 0.5) +
  theme_bw() + # Set theme
  theme(panel.grid.major = element_blank(), # Remove grid
        panel.grid.minor = element_blank(), # Remove grid
        panel.border = element_blank(), # Remove grid
        panel.background = element_blank()) + # Remove grid
  labs(x = "Number of Trees", title = "Error Rate v Number of Trees",
       y = "Error Rate", color = "Learning \n Rate") + # Set labels
  coord_cartesian(xlim = c(0, 100), ylim = c(0, 0.1)) # Zoom without dropping data
g_6
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Train the final public-school model at the learning rate chosen from the
# CV sweep (eta = 0.1).
# Fix: dropped nfold = 5 — xgboost() trains a single model (no CV) and
# warned 'Parameters: { "nfold" } are not used'. With no separate
# watchlist, early_stopping_rounds monitors the training RMSE.
public_bst_mod_final <- xgboost(data = dtrain_public, # Set training data
              eta = 0.1, # Set learning rate
              nrounds = 100, # Set number of rounds
              early_stopping_rounds = 100, # Stop after 100 rounds with no improvement
              verbose = 1, # 1 - Prints out fit
              nthread = 1, # Set number of parallel threads
              print_every_n = 20, # Prints out result every 20th iteration
              objective = "reg:squarederror",  # Regression objective
              eval_metric = "rmse") # Track RMSE
## [1]  train-rmse:0.498477 
## Will train until train_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.073745 
## [41] train-rmse:0.016763 
## [61] train-rmse:0.007070 
## [81] train-rmse:0.004419 
## [100]    train-rmse:0.002912

2.4.5 Public School Feature Importance

2.4.5.1 All Features

# Rebuild the public-school design matrices (same columns as the training
# section) so this section can run stand-alone
public_importance_cols <- c(2:8, 10:51)
dtrain_public <- xgb.DMatrix(
  data = as.matrix(xg_public_train_df[, public_importance_cols]),
  label = as.numeric(xg_public_train_df$rel_apply) - 1
)
dtest_public <- xgb.DMatrix(
  data = as.matrix(xg_public_test_df[, public_importance_cols]),
  label = as.numeric(xg_public_test_df$rel_apply) - 1
)
# Feature importance for PUBLIC schools (comment corrected: this uses the
# final public model, not a private one)
importance_matrix_public <- xgb.importance( model = public_bst_mod_final)

# Print the feature importances
print(importance_matrix_public)
##                           Feature         Gain       Cover   Frequency
##                            <char>        <num>       <num>       <num>
##  1:                 rel_apply_sat 8.795855e-01 0.130952509 0.072534404
##  2:                    rel_attend 5.082076e-02 0.222682181 0.104357798
##  3:      stderr_rel_apply_instate 1.271313e-02 0.014796146 0.013761468
##  4:             rel_apply_oostate 1.108467e-02 0.058169339 0.045011468
##  5:                par_income_bin 1.010362e-02 0.016243945 0.062500000
##  6:              rel_att_cond_app 9.433339e-03 0.090582158 0.070814220
##  7:             rel_apply_instate 4.819239e-03 0.030067743 0.036697248
##  8:      attend_level_instate_sat 4.146900e-03 0.004379145 0.007167431
##  9:              stderr_rel_apply 1.985254e-03 0.021999392 0.030676606
## 10:                 stderr_attend 1.569007e-03 0.009162243 0.030963303
## 11:          stderr_rel_apply_sat 1.423387e-03 0.006524032 0.015768349
## 12:                        attend 1.080127e-03 0.014023987 0.049025229
## 13:  stderr_rel_apply_oostate_sat 8.877320e-04 0.011053318 0.010894495
## 14:                    attend_sat 8.743783e-04 0.009559047 0.031250000
## 15: stderr_rel_attend_oostate_sat 8.553265e-04 0.010642215 0.010321101
## 16:      rel_att_cond_app_instate 7.889163e-04 0.013205355 0.015194954
## 17:                attend_oostate 6.757113e-04 0.014066885 0.013474771
## 18:     stderr_attend_instate_sat 6.683678e-04 0.030643287 0.017775229
## 19:         rel_apply_oostate_sat 6.502065e-04 0.030107066 0.019782110
## 20:                  attend_level 5.358668e-04 0.005837668 0.012327982
## 21:                attend_instate 4.874149e-04 0.009087172 0.018061927
## 22:             stderr_attend_sat 3.853706e-04 0.010499222 0.017775229
## 23:      rel_att_cond_app_oostate 3.491389e-04 0.009980875 0.015481651
## 24:          rel_att_cond_app_sat 3.384530e-04 0.015328793 0.021502294
## 25:         rel_apply_instate_sat 3.328558e-04 0.008154146 0.015481651
## 26:  rel_att_cond_app_instate_sat 3.031087e-04 0.008954904 0.011467890
## 27:         stderr_rel_attend_sat 2.889052e-04 0.006795717 0.010894495
## 28:                rel_attend_sat 2.520679e-04 0.019736536 0.026376147
## 29:              attend_level_sat 2.235196e-04 0.003903695 0.008314220
## 30:     stderr_rel_attend_instate 1.854963e-04 0.008343611 0.008600917
## 31:         stderr_attend_oostate 1.822433e-04 0.009140794 0.013474771
## 32:         stderr_attend_instate 1.715580e-04 0.012912220 0.013188073
## 33:      stderr_rel_apply_oostate 1.661473e-04 0.007117450 0.008314220
## 34:     stderr_rel_attend_oostate 1.613415e-04 0.002906322 0.006594037
## 35:  rel_att_cond_app_oostate_sat 1.588805e-04 0.006441811 0.012901376
## 36:            rel_attend_instate 1.580274e-04 0.012286628 0.010894495
## 37: stderr_rel_attend_instate_sat 1.543750e-04 0.003335299 0.005447248
## 38:  stderr_rel_apply_instate_sat 1.498414e-04 0.015418164 0.011467890
## 39:     stderr_attend_oostate_sat 1.497366e-04 0.007088852 0.012901376
## 40:          attend_level_instate 1.261494e-04 0.004336247 0.004300459
## 41:            attend_oostate_sat 1.147398e-04 0.004257601 0.009747706
## 42:        rel_attend_oostate_sat 1.080170e-04 0.007160348 0.012614679
## 43:        rel_attend_instate_sat 8.997062e-05 0.006992332 0.012614679
## 44:      attend_level_oostate_sat 7.282942e-05 0.006255921 0.004013761
## 45:          attend_level_oostate 6.523056e-05 0.007471357 0.004587156
## 46:             stderr_rel_attend 4.904695e-05 0.004071711 0.008887615
## 47:            attend_instate_sat 4.726513e-05 0.010610042 0.010321101
## 48:            rel_attend_oostate 2.448261e-05 0.019354031 0.011467890
## 49:                          tier 2.394083e-06 0.007360538 0.002006881
##                           Feature         Gain       Cover   Frequency
# Plot of feature importance for PUBLIC schools (label corrected from "PRIVATE")
xgb.plot.importance(importance_matrix_public)

From this graph, I take rel_apply_instate and every feature ranked below it; in the next section, “Detailed Features”, I then report the bottom 10 of those.

# Re-source the SHAP helper functions (local, user-specific path)
source("~/Downloads/a_insights_shap_functions.r")

# SHAP importance for the final public model
shap_result <- shap.score.rank(
  xgb_model = public_bst_mod_final,
  X_train = as.matrix(xg_public_train_df[, c(2:8, 10:51)]),
  shap_approx = FALSE  # exact SHAP values
)
## make SHAP score by decreasing order
# Top-10 SHAP feature importances
var_importance(shap_result, top_n = 10)

# Long-format SHAP values for the summary plot
shap_long <- shap.prep(
  shap = shap_result,
  X_train = as.matrix(xg_public_train_df[, c(2:8, 10:51)]),
  top_n = 10
)

plot.shap.summary(data_long = shap_long)

# Sanity check: do the model's feature names match the training columns?
# Fix: the original read colnames(public_bst_mod_final$data), but an
# xgb.Booster has no $data element, so model_feature_names was NULL — which
# is why the first setdiff printed NULL and the second listed EVERY feature
# as "missing from the model". The trained feature names are stored in the
# booster's feature_names field.
model_feature_names <- public_bst_mod_final$feature_names

# Feature names of the data the model is applied to
data_feature_names <- colnames(xg_public_train_df[,c(2:8, 10:51)])

# Both differences should be empty (character(0)) when model and data agree
setdiff(model_feature_names, data_feature_names)  # In the model but not the data
setdiff(data_feature_names, model_feature_names)  # In the data but not the model

2.4.5.2 Detailed Features

# Set matrices for xgboost analysis for public schools, restricted to the
# top-10 features selected from the importance ranking above.
# NOTE(review): indices are positional -- confirm they still map to the
# intended features if xg_public_train_df changes shape.
# Label: rel_apply converted to a 0-based numeric code (as.numeric(factor) - 1)
dtrain_public2 <- xgb.DMatrix(data = as.matrix(xg_public_train_df[, c(6, 32, 10, 37, 47, 7, 17, 27, 22, 42)]), label = as.numeric(xg_public_train_df$rel_apply) -1)
dtest_public2 <- xgb.DMatrix(data = as.matrix(xg_public_test_df[, c(6, 32, 10, 37, 47, 7, 17, 27, 22, 42)]), label = as.numeric(xg_public_test_df$rel_apply) -1)

# Fit the final public-school model on the reduced feature set.
# FIX: removed `nfold` -- xgboost() ignores it (the previous render warned
# 'Parameters: { "nfold" } are not used'); cross-validation requires xgb.cv().
public_bst_mod_final2 <- xgboost(data = dtrain_public2, # Set training data
              
              eta = 0.1, # Set learning rate
              
              nrounds = 100, # Set number of rounds
              early_stopping_rounds = 100, # Stop if no improvement; with no separate watchlist this monitors train-rmse, so it rarely triggers
               
              verbose = 1, # 1 - Prints out fit
              nthread = 1, # Set number of parallel threads
              print_every_n = 20, # Prints out result every 20th iteration
              
              objective = "reg:squarederror",  # Regression objective (squared error)
              eval_metric = "rmse")
## [23:52:16] WARNING: src/learner.cc:767: 
## Parameters: { "nfold" } are not used.
## 
## [1]  train-rmse:0.502742 
## Will train until train_rmse hasn't improved in 100 rounds.
## 
## [21] train-rmse:0.106989 
## [41] train-rmse:0.048002 
## [61] train-rmse:0.032751 
## [81] train-rmse:0.024787 
## [100]    train-rmse:0.019116
# Feature importance for public schools (reduced, top-10-feature model)
# NOTE(review): this reuses the name `importance_matrix_public`, overwriting
# the importance matrix computed for the full model earlier in the document.
importance_matrix_public <- xgb.importance( model = public_bst_mod_final2)

# Print the feature importances (Gain / Cover / Frequency per feature)
print(importance_matrix_public)
##                          Feature       Gain      Cover  Frequency
##                           <char>      <num>      <num>      <num>
##  1:        stderr_rel_attend_sat 0.45761309 0.17337198 0.10364842
##  2:                   attend_sat 0.20058436 0.15060523 0.20370370
##  3:             stderr_rel_apply 0.07623884 0.10366050 0.11083472
##  4: stderr_rel_apply_oostate_sat 0.06267240 0.06919137 0.06688778
##  5:     stderr_rel_apply_oostate 0.04962404 0.04335253 0.08761747
##  6:     stderr_rel_apply_instate 0.04205569 0.12570604 0.12907684
##  7:            stderr_attend_sat 0.03500050 0.10029369 0.08430072
##  8:           attend_instate_sat 0.03289434 0.11217983 0.09894970
##  9: stderr_rel_apply_instate_sat 0.03181897 0.08745401 0.07877280
## 10:         attend_level_oostate 0.01149776 0.03418480 0.03620785
# Plot of feature importance for PUBLIC schools (reduced model)
# (original comment said PRIVATE in error)
xgb.plot.importance(importance_matrix_public)

# Re-load SHAP helper functions.
# NOTE(review): hard-coded personal path, and this file was already sourced
# above -- a relative, project-local path sourced once would be cleaner.
source("~/Downloads/a_insights_shap_functions.r")

# Calculate SHAP importance for the reduced (top-10 feature) public model.
# The column indices here must match those used to build dtrain_public2.
shap_result <- shap.score.rank(xgb_model = public_bst_mod_final2, 
                X_train =as.matrix(xg_public_train_df[,c(6, 32, 10, 37, 47, 7, 17, 27, 22, 42)]),
                shap_approx = F)
## make SHAP score by decreasing order
# Plot SHAP importance (top 10 features by mean |SHAP|)
var_importance(shap_result, top_n=10)

# Reshape SHAP scores to long format for the summary plot
shap_long = shap.prep(shap = shap_result,
                           X_train = as.matrix(xg_public_train_df[,c(6, 32, 10, 37, 47, 7, 17, 27, 22, 42)]), 
                           top_n = 10)


# SHAP summary plot: feature value vs. SHAP contribution per observation
plot.shap.summary(data_long = shap_long)

3 Predict and calculate RMSE

# private_bst_mod_final is our trained xgb model for private school data;
# predict on the held-out test matrix for RMSE calculations.
# (Original comment referenced "Private_bst_1", which does not match the
# model object actually used here.)
preds_private <- predict(private_bst_mod_final, dtest_private)

# Actual values from xg_private_test_df, converted to the same 0-based numeric
# coding used for the training labels.
# Assumes rel_apply is a factor whose codes start at 1 -- TODO confirm.
actual_private <- as.numeric(xg_private_test_df$rel_apply) - 1

# Calculate RMSE (Metrics::rmse takes actual first, then predicted)
rmse_private <- rmse(actual_private, preds_private)

# Print RMSE
print(paste("Private Model RMSE:", rmse_private))
## [1] "Private Model RMSE: 0.0837600179618872"
# public_bst_mod_final is our trained xgb model for public school data;
# predict on the held-out test matrix for RMSE calculations.
# (Original comment referenced "Public_bst_1", which does not match the
# model object actually used here.)
preds_public <- predict(public_bst_mod_final, dtest_public)

# Actual values from xg_public_test_df, converted to the same 0-based numeric
# coding used for the training labels.
actual_public <- as.numeric(xg_public_test_df$rel_apply) - 1

# RMSE for public schools (Metrics::rmse takes actual first, then predicted)
rmse_public <- rmse(actual_public, preds_public)

# Print RMSE
print(paste("Public Model RMSE:", rmse_public))
## [1] "Public Model RMSE: 0.0830721303856991"

4 Visualize predicted versus actual

# Scatter of predicted vs. actual outcomes for the private-school model;
# points on the red 45-degree line are perfect predictions.
plot(x = actual_private, y = preds_private,
     main = "Private Model: Predicted vs Actual",
     xlab = "Actual Values", ylab = "Predicted Values",
     pch = 19, col = "blue")
abline(a = 0, b = 1, col = "red")  # 45 deg. reference line

# Scatter of predicted vs. actual outcomes for the public-school model
plot(x = actual_public, y = preds_public,
     main = "Public Model: Predicted vs Actual",
     xlab = "Actual Values", ylab = "Predicted Values",
     pch = 19, col = "blue")
abline(a = 0, b = 1, col = "red")  # 45 degree reference line